**** This do-file cleans the endline survey dataset of grade 12 students

* ---- Session setup ----
clear all
capture log close
set more off
set varabbrev off    // variable names must be typed out in full

log using "$logfile/build_grade12students_endline.log", replace

* ---- Load the raw endline survey export ----
global endlinecsv "$raw/student/grade1112/20220404174151-SurveyExport.csv"
import delimited "$endlinecsv", clear

* Drop respondents who declared that they are not Grade XII students
drop if thissurveyisintendedforgradexiis == "No, I am not a Grade XII student."

* Give the personal-information questions short names.
* capture suppresses the error when a rename cannot be carried out.
capture rename whatisyourname                   name_original_e
capture rename whatisyourgender                 sex
capture rename whatisyourcitizenshipidcode11dig citizenid
capture rename inwhichdzongkhagisyourschoolloca district
capture rename dateofbirthdaywhatisyourdateofbi b_day
capture rename dateofbirthmonthwhatisyourdateof b_month
capture rename dateofbirthyearwhatisyourdateofb b_year
capture rename whatisyourstudentidcode          studentid
capture rename whatisyouremailaddressprovidedby email
capture rename whatisyourprivateemailaddress    email_private
capture rename whatisyourmobilephonenumber      phone
capture rename whatisyouracademicstream         stream

label variable name_original_e "name reported at endline survey"

* sex: code 2 becomes 0, then label 1 = Male / 0 = Female
recode sex (2=0)
label define sex 1 "Male" 0 "Female"
label values sex sex

* citizenid: strip stray punctuation, normalise case/blanks, drop "na" fragments
foreach junk in "." "]" "/" {
	replace citizenid = subinstr(citizenid, "`junk'", "", .)
}
replace citizenid = trim(itrim(lower(citizenid)))
replace citizenid = subinstr(citizenid, "na", "", .)
format citizenid %11s

* Blank out placeholder IDs and anything that is not exactly 11 characters long
replace citizenid = "" if inlist(citizenid, "00000000000","12345678910", "01234567890", "12345678902", "12345678901", "12345678912", "12345678911") ///
	| strlen(citizenid) != 11

* district names in lower case
replace district = strlower(district)

* Reported name: lower-case it and strip e-mail fragments typed into the name field
replace name_original_e = lower(name_original_e)
foreach frag in "@gemailcom" "@gmailcom" {
	replace name_original_e = subinstr(name_original_e, "`frag'", "", .)
}

* name: a compact version of the reported name (no blanks, no punctuation)
generate name = name_original_e
replace name = trim(itrim(lower(name)))
replace name = subinstr(name, " ", "", .)

* Remove apostrophes, then: comma, dot, slash, dash, equals sign, parentheses
replace name = subinstr(name, "'", "", .)
foreach ch in "," "." "/" "-" "=" "(" ")" {
	replace name = subinstr(name, "`ch'", "", .)
}

* Remove filler text; the e-mail fragments only match now that dots are gone
foreach frag in "mynameis" "@gmailcom" "@gemailcom" {
	replace name = subinstr(name, "`frag'", "", .)
}

format name %25s
format name_original_e %30s

* format studentid
* Goal: reduce every entry to the bare student code. Entries that are not
* 17 characters long are listed below and treated as invalid later in the file.
replace studentid = trim(itrim(lower(studentid)))
replace studentid = subinstr(studentid,"..",".",.)
replace studentid = subinstr(studentid," ","",.)

* Strip complete or partial e-mail domains appended to the code
replace studentid = subinstr(studentid,"@education.gov.bt","",.)
* All blanks were removed above, so the mistyped suffix "@education.gov. t"
* now reads "@education.gov.t"; the pattern with a blank could never match.
replace studentid = subinstr(studentid,"@education.gov.t","",.)
replace studentid = subinstr(studentid,"ation.gov.bt","",.)
replace studentid = subinstr(studentid,"@educ","",.)
replace studentid = subinstr(studentid,"@.educ","",.)
replace studentid = subinstr(studentid,"ationgov.bt","",.)

* Write the remaining malformed IDs to the log for review.
* list is used rather than browse because browse is interactive-only and leaves no trace in the log.
list responseid studentid if strlen(studentid)~=17 & ~missing(studentid), noobs

* Manual corrections for individual responses
replace studentid="" if inlist(responseid, 4152, 4516) | studentid=="na"
replace studentid="201.00342.14.0027" if studentid=="201.00342.14.0027."
* The ID was typed twice, separated by two backticks. char(96) builds the
* backticks so they are not parsed as the start of a local-macro reference.
replace studentid="201.00272.11.0044" if studentid=="201.00272.11.0044" + char(96) + char(96) + "201.00272.11.0044"

* School name: start from the main school-name question, then fill the gaps
* from columns v33-v41 (first non-empty column wins).
rename whatisthenameofyourschool schoolname

* v40 is converted to string so it can be copied into schoolname;
* the "." that tostring writes for numeric missing is blanked out.
tostring v40, replace
replace v40 = "" if v40 == "."

forvalues col = 33/41 {
	replace schoolname = v`col' if missing(schoolname) & !missing(v`col')
}
replace schoolname = strlower(schoolname)

* Partial submissions are kept only when they can still identify the student.
* A partial response is discarded when ALL of the following hold:
*   - the name is missing or at most 2 characters long,
*   - the student ID is missing,
*   - sex and every date-of-birth component are missing.
label variable status "survey status"

local no_name  (missing(name_original_e) | strlen(name_original_e) <= 2)
local no_demog (missing(sex) & missing(b_month) & missing(b_year) & missing(b_day))

keep if status == "Complete" | ///
	(status == "Partial" & !(`no_name' & missing(studentid) & `no_demog'))
		
* Self-reported treatment variables from the endline survey

* treat_claim is 1 when the participation question equals 1 and 0 otherwise
generate treat_claim = (didyouparticipateinthementorship == 1)
label variable treat_claim "students claimed to have been participating in the program"

* Questions about the mentorship itself
rename (howmanytimesdidyoumeetyourmentor v1272) ///
       (treat_meet_person                treat_meet_online)
rename (overallsatisfactionwiththementor overallsatisfactionwiththeconten) ///
       (treat_satisfaction_mentor        treat_satisfaction_program)
rename howlikelythatyouwillstayincontac treat_contact

* untreated_* questions
rename (doyouknowthatsomestudentsinyours haveyoutalkedwithanyofyourclassm ifthementoringhadbeenavailableto) ///
       (untreated_tschool                untreated_talk                   untreated_interest)

* control_* questions
rename (didyouhearaboutthestudentmentori haveyoutalkedwithanyonewhoreceiv v1283) ///
       (control_cschool                  control_talk                     control_interest)

* Recode answer code 2 to 0 in the four indicator variables
recode untreated_tschool control_cschool untreated_interest control_interest (2=0)
  
* Restrict the dataset to identifiers, contact details and treatment reports
keep responseid schoolname name* sex citizenid district b_* studentid ///
	email* phone stream treat_* control* untreated* status

* Phone: lower-case, remove "na" fragments, and mark as an endline variable
replace phone = subinstr(strlower(phone), "na", "", .)
rename phone phone_e

* Birth date packed as year*10000 + month*100 + day, then stored as a string.
* The number must be held as double: a default float represents integers
* exactly only up to 16,777,216, so an 8-digit value such as 20040315 would be
* silently rounded to a neighbouring number before being converted to text.
generate double bday_e = b_year*10^4 + b_month*10^2 + b_day
tostring bday_e, replace

* Flag student IDs that occur more than once
duplicates tag studentid, generate(dup_studentid)

* Within a student ID, "Complete" sorts before "Partial" and lower response IDs come first,
* so the row that survives each drop below is the first one in that order.
sort studentid status responseid

* Repeated submissions: drop a row when it matches the row directly above it.
* The pass is repeated because each drop changes which rows are adjacent.
forvalues pass = 1/3 {
	* same name, student ID and citizen ID
	drop if dup_studentid != 0 & name == name[_n-1] ///
		& studentid == studentid[_n-1] & citizenid == citizenid[_n-1]
	* same name, date of birth, sex and stream
	drop if dup_studentid != 0 & name == name[_n-1] ///
		& b_day == b_day[_n-1] & b_month == b_month[_n-1] & b_year == b_year[_n-1] ///
		& sex == sex[_n-1] & stream == stream[_n-1]
}

* Recompute the duplicate flag on the reduced data
drop dup_studentid
duplicates tag studentid, generate(dup_studentid)

* Responses whose student ID cannot be used as a key are moved to a separate
* file so that the ID can be recovered later on.
sort studentid responseid
rename (responseid status) (responseid_e status_e)

* Unusable: placeholder IDs, IDs that are not 17 characters long, or
* duplicated IDs that have no citizen ID to tell the respondents apart.
local unusable_id inlist(studentid, "000.00000.00.0000", "111.22222.33.4444") | strlen(studentid) != 17 | (missing(citizenid) & dup_studentid != 0)

preserve
	keep if `unusable_id'
	drop dup_studentid
	save "$temp/g12_p0e3.dta", replace
restore

drop if `unusable_id'

* save as two datasets: with unique ID and non-unique ID

* (1) responses whose student ID is already unique -> g12_p0e1
preserve
keep if dup_studentid==0
drop dup_studentid
save "$temp/g12_p0e1.dta", replace
restore

* (2) responses that share a student ID
keep if dup_studentid~=0

* responseid_e is the final sort key so that ties are ordered the same way on
* every run; without it the order of tied rows is arbitrary and the rows
* removed by the [_n-1] comparisons below could change between runs.
sort studentid citizenid status_e responseid_e
drop if name==name[_n-1] & studentid==studentid[_n-1] & citizenid==citizenid[_n-1]
drop if b_day==b_day[_n-1] & b_month==b_month[_n-1] & b_year==b_year[_n-1] & sex==sex[_n-1] & studentid==studentid[_n-1]

* Hand-picked responses to discard (NOTE(review): reason not recorded here - confirm)
drop if inlist(responseid_e, 724, 739, 9798, 9957)

* Recompute the duplicate flag after the clean-up
drop dup_studentid
duplicates tag studentid, g(dup_studentid)

* IDs that became unique are added to g12_p0e1
preserve
keep if dup_studentid==0
drop dup_studentid
append using "$temp/g12_p0e1.dta"
save "$temp/g12_p0e1.dta", replace
restore

* IDs that are still shared by several responses -> g12_p0e2
drop if dup_studentid==0
save "$temp/g12_p0e2.dta", replace





